In [65]:
import networkx as nx
import custom_funcs as cf
import pandas as pd
import matplotlib.pyplot as plt
import dendropy

from Levenshtein import distance
from collections import defaultdict, Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment
from itertools import product

%load_ext autoreload
%autoreload 2

%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
# Load the pickled graph, then run it through each custom_funcs clean-up /
# imputation pass in order. Every pass takes the graph and hands a graph back,
# so the result of one is fed straight into the next.
# NOTE(review): what each pass does is inferred from its name only -- see custom_funcs.
G = nx.read_gpickle('20150902_all_ird Final Graph.pkl')
for graph_pass in (cf.clean_host_species_names,
                   cf.impute_reassortant_status,
                   cf.impute_weights):
    G = graph_pass(G)

In [3]:
# Peek at one (node, attribute-dict) pair to sanity-check the node metadata.
# Materialise the nodes as a list before indexing by position: on networkx 1.x
# nodes(data=True) is already a list (so this is a harmless copy), but on 2.x it
# is a view keyed by node name, where [1286] would be a lookup of a node *named*
# 1286 rather than the 1287th node.
list(G.nodes(data=True))[1286]


Out[3]:
('A/Tennessee/F2018A/2011',
 {'collection_date': Timestamp('2011-01-24 00:00:00'),
  'country': 'USA',
  'host_species': 'Human',
  'reassortant': False,
  'state': 'Tennessee',
  'subtype': 'H3N2'})

In [4]:
# Get all of the host species with TOL and BOLD links
# (the TOL_url / BOLD_url columns point at tolweb.org and boldsystems.org records).
# index_col=0 uses the first CSV column as the row index. Rows that could not be
# resolved have NaN in 'sequence'; the 'notes' column records why.
hosts_with_coi = pd.read_csv('host_species.csv', index_col=0)
hosts_with_coi


Out[4]:
host_species TOL_species_name TOL_url sequence BOLD_url notes
0 Sparrow NaN NaN NaN NaN ambiguous term
1 American Green-Winged Teal Anas carolinensis http://tolweb.org/Anas_carolinensis/89249 TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA... http://boldsystems.org/index.php/Public_Record... NaN
2 Turkey Meleagris gallopavo http://tolweb.org/Meleagris/57202 GTGACTTTCATCAACCGATGATTATTTTCAACCAACCATAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
3 Semipalmated Sandpiper Calidris pusilla http://tolweb.org/Calidris_pusilla/90811 GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
4 Murre NaN NaN NaN NaN not available on TOL
5 Heron NaN NaN NaN NaN more than one record on TOL
6 Wood Duck Aix sponsa http://tolweb.org/Aix/89196 CTTGTACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA... http://boldsystems.org/index.php/Public_Record... NaN
7 Myna NaN NaN NaN NaN not available on TOL
8 Ostrich Struthio camelus http://tolweb.org/Struthio_camelus/26289 GTGACCTTCATTACTCGATGACTTTTTTCAACAAATCACAAAGACA... http://boldsystems.org/index.php/Public_Record... NaN
9 Ruddy Turnstone Arenaria interpres http://tolweb.org/Arenaria_interpres/90804 GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
10 Sanderling Calidris alba http://tolweb.org/Calidris_alba/90810 GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
11 Camel Camelus dromedarius http://tolweb.org/Camelus_dromedarius/30349 CTAAGCTTATTAATTCGTGCTGAATTGGGGCAGCCTGGGACATTGC... http://boldsystems.org/index.php/Public_Record... NaN
12 Redhead Aythya americana http://tolweb.org/Aythya_americana/89260 ATGGCACACCAAGCACACTCCTACCACATAGTAGACCCAAGCCCCT... http://boldsystems.org/index.php/Public_Record... NaN
13 Eagle NaN NaN NaN NaN ambiguous term
14 Mallard Anas platyrhynchos http://tolweb.org/Anas_platyrhynchos/89217 GGGGCATGAGCCGGAATAATTGGCACAGCACTCAGCCTACTGATCC... http://boldsystems.org/index.php/Public_Record... NaN
15 Swan NaN NaN NaN NaN ambiguous term
16 Great Crested Grebe Podiceps cristatus http://tolweb.org/Podiceps_cristatus/89462 TCTATACTTAATCTTTGGTGCATGAGCCGGCATAGTCGGCACCGCC... http://boldsystems.org/index.php/Public_Record... NaN
17 Herring Gull Larus smithsonianus http://tolweb.org/Larus_smithsonianus/129496 TAGGTACTGCCCTCAGCCTGCTTATCCGTGCAGAACTTGGCCAACC... http://boldsystems.org/index.php/Public_Record... also have european herring gull on TOL
18 Northern Shoveler Anas clypeata http://tolweb.org/Anas_clypeata/89233 CAAGACATTGGCACTCTATACCTTATCTTCGGGGCATGAGCCGGAA... http://boldsystems.org/index.php/Public_Record... NaN
19 Blue-Winged Teal Anas discors http://tolweb.org/Anas_discors/89228 TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA... http://boldsystems.org/index.php/Public_Record... NaN
20 Greater White-Fronted Goose Anser albifrons http://tolweb.org/Anser_albifrons/89141 GTGACCTTCATCAACCGATGACTATTTTCCACTAACCATAAGGATA... http://boldsystems.org/index.php/Public_Record... NaN
21 White-Rumped Sandpiper Calidris fuscicollis http://tolweb.org/Calidris_fuscicollis/90818 GTGACTTTTATTAATCGATGACTATTCTCAACCAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
22 Ring-Billed Gull Larus delawarensis http://tolweb.org/Larus_delawarensis/90604 GTGACCTTTATCAATCGATGATTATTTTCAACAAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
23 Parrot NaN NaN NaN NaN ambiguous term
24 Tundra Swan Cygnus columbianus http://tolweb.org/Cygnus_columbianus/89161 GTGACCTTCATCAACCGATGACTATTTTCCACTAACCATAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
25 Ring-Necked Duck Aythya collaris http://tolweb.org/Aythya_collaris/89267 CCTATATCTTATCTTTGGGGCATGAGCCGGAATAATCGGCACAGCA... http://boldsystems.org/index.php/Public_Record... NaN
26 Chicken Gallus gallus http://tolweb.org/Gallus/57162 ATCGTCACAGCCCATGCTTTCGTCATAATCTTCTTTATAGTTATAC... http://boldsystems.org/index.php/Public_Record... other terms found, but gallus gallus is the kn...
27 American Black Duck Anas rubripes http://tolweb.org/Anas_rubripes/89216 TATACCTTATCTTCGGGACATGAGCCGGAATAATTGGCACAGCACT... http://boldsystems.org/index.php/Public_Record... NaN
28 Whiskered Tern Chlidonias hybrida http://tolweb.org/Chlidonias_hybrida/90682 GTGACCTTCATCAACCGATGATTATTTTCAACAAACCACAAAGATA... http://boldsystems.org/index.php/Public_Record... NaN
29 Mink NaN NaN NaN NaN not available on TOL
... ... ... ... ... ... ...
122 Gadwall Anas strepera http://tolweb.org/Anas_strepera/89210 CTTATCTTCGGGGCATGGGCCGGAATAATTGGCACAGCACTCAGCC... http://boldsystems.org/index.php/Public_Record... NaN
123 Gull NaN NaN NaN NaN ambiguous term
124 Barnacle Goose Branta leucopsis http://tolweb.org/Branta_leucopsis/89153 CTCATCTTCGGAGCATGAGCAGGAATAGTCGGCACCGCACTCAGCC... http://boldsystems.org/index.php/Public_Record... NaN
125 Panda Ailuropoda melanoleuca https://en.wikipedia.org/wiki/Giant_panda ATGTTCATTAACCGATGACTGTTTTCCACCAACCACAAAGATATTG... http://boldsystems.org/index.php/Public_Record... species name found on Wikipedia
126 Condor NaN NaN NaN NaN ambiguous term
127 Unknown NaN NaN NaN NaN ambiguous term
128 Flycatcher NaN NaN NaN NaN ambiguous term
129 Waterfowl NaN NaN NaN NaN ambiguous term
130 Grey Heron Ardea cinerea http://tolweb.org/Ardea_cinerea/89637 ATCTTCGGAGCATGAGCCGGCATAATTGGAACCGCCCTAAGCCTAC... http://boldsystems.org/index.php/Public_Record... NaN
131 Grebe NaN NaN NaN NaN ambiguous term
132 Mallard-Black Duck Hybrid NaN NaN NaN NaN not available on TOL
133 Iceland Gull Larus glaucoides http://tolweb.org/Larus/90592 TCTTCGGCGCATGAGCTGGCATAGTAGGTACTGCCCTCAGCCTGCT... http://boldsystems.org/index.php/Public_Record... NaN
134 Shoveler NaN NaN NaN NaN ambiguous term
135 Snow Goose Chen caerulescens http://tolweb.org/Chen_caerulescens/89145 CCTATACCTCATNTTCGGGGCATGAGCAGGAATAGTCGGCACCGCA... http://boldsystems.org/index.php/Public_Record... NaN
136 Sea Mammal NaN NaN NaN NaN ambiguous term
137 Domestic Cat Felis catus http://tolweb.org/Felis_catus/123531 TCCGGGCCGAACTGGGCCAACCTGGTACACTACTAGGAGATGATCA... http://boldsystems.org/index.php/Public_Record... NaN
138 Red Knot Calidris canutus http://tolweb.org/Calidris_canutus/90809 TTTTCTCCAACCCACAAAGACATTGGCACCCTATACCTAATCTTCG... http://boldsystems.org/index.php/Public_Record... NaN
139 Spot-Billed Duck NaN NaN NaN NaN ambiguous term
140 Quail NaN NaN NaN NaN ambiguous term
141 Coot NaN NaN NaN NaN ambiguous term
142 Magpie Anseranas semipalmata http://tolweb.org/Anseranas_semipalmata/26294 GTGACCTTCATTAACCGCTGACTATTCTCAACTAACCATAAAGACA... http://boldsystems.org/index.php/Public_Record... NaN
143 Babbler Eupetes macrocerus http://tolweb.org/Eupetidae/79175 NaN NaN not available on BOLD
144 Guinea Fowl NaN NaN NaN NaN ambiguous term
145 Black Scoter Melanitta nigra http://tolweb.org/Melanitta_nigra/89281 CTTATCTACNCGGCATGAGCCGGAATAATTGGCACAGCACTCAGCC... http://boldsystems.org/index.php/Public_Record... NaN
146 Magpie Robin NaN NaN NaN NaN not available on TOL
147 Pigeon NaN NaN NaN NaN ambiguous term
148 Sharp-Tailed Sandpiper Calidris acuminata http://tolweb.org/Calidris_acuminata/90821 GTGACTTTCATCAACCGATGATTATTCTCAACCAACCACAAAGACA... http://boldsystems.org/index.php/Public_Record... NaN
149 Common Goldeneye Bucephala clangula http://tolweb.org/Bucephala_clangula/89286 TTCTCCAACCACAAAGACATTGGCACCCTATATCTTATCTTCGGAG... http://boldsystems.org/index.php/Public_Record... NaN
150 Chukar NaN NaN NaN NaN ambiguous term
151 Garganey Anas querquedula http://tolweb.org/Anas_querquedula/89246 TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA... http://boldsystems.org/index.php/Public_Record... NaN

152 rows × 6 columns


In [5]:
# Compile COI sequences into a FASTA file to do multiple sequence alignment.
#
# Hosts without a COI sequence are skipped. Each record ID has the form
# "<host_species>.<TOL_species_name>", with spaces replaced by underscores so the
# ID survives as a single whitespace-free token in the FASTA header.

coi_sequences = []
for _, d in hosts_with_coi.iterrows():
    if pd.isnull(d['sequence']):
        continue
    record_id = '{0}.{1}'.format(d['host_species'].replace(' ', '_'),
                                 d['TOL_species_name'].replace(' ', '_'))
    # Blank out name/description (same as for the trimmed records written later):
    # left at their defaults, the FASTA header would read
    # ">record_id <unknown description>" instead of just ">record_id".
    seqrecord = SeqRecord(Seq(d['sequence']), id=record_id, name='', description='')
    coi_sequences.append(seqrecord)
# SeqIO.write returns the number of records written; keep it as the cell output.
SeqIO.write(coi_sequences, 'host_coi_unaligned.fasta', 'fasta')


Out[5]:
74

In [6]:
# After aligning using clustal omega (default parameters), load back the alignment.
# The alignment itself is run outside this notebook; 'host_coi_aligned.fasta' must
# already exist on disk before this cell is executed.
# NOTE(review): presumably produced from 'host_coi_unaligned.fasta' written above -- confirm.
coi_aligned = AlignIO.read('host_coi_aligned.fasta', 'fasta')
coi_aligned


Out[6]:
<<class 'Bio.Align.MultipleSeqAlignment'> instance (74 records of length 1638, SingleLetterAlphabet()) at 7f6e604a9710>

In [7]:
# To identify where to trim the alignment, look at the number of gaps.
# Build a {column position: gap count} mapping over every alignment column,
# then plot count against position.

num_gaps = {
    position: Counter(coi_aligned[:, position])['-']
    for position in range(coi_aligned.get_alignment_length())
}
positions = list(num_gaps.keys())
gap_counts = list(num_gaps.values())
plt.plot(positions, gap_counts)
plt.xlabel('position in alignment')
plt.ylabel('number of gap characters')


Out[7]:
<matplotlib.text.Text at 0x7f6e602967f0>

In [8]:
# Given this distribution of gaps, we will use a cut-off of 3 gaps to trim the alignment.
# i.e. if there are more than 3 gaps at that position, we trim that position out.
MAX_GAPS_PER_POSITION = 3

# One row per aligned record, one column per alignment position (labelled 0..L-1).
index = [s.id for s in coi_aligned]
coi_df = pd.DataFrame([list(s.seq) for s in coi_aligned])

# Collect every over-gapped position first and drop them in a single call:
# dropping inside the loop copies the whole frame once per removed column.
# The per-column count is kept in its own name so the {position: count} dict
# `num_gaps` built for the gap plot is not overwritten with an int.
gappy_positions = []
for i in range(coi_aligned.get_alignment_length()):
    gap_count = Counter(coi_aligned[:, i])['-']
    if gap_count > MAX_GAPS_PER_POSITION:
        gappy_positions.append(i)

coi_df = coi_df.drop(gappy_positions, axis=1)
coi_df.index = index
coi_df


Out[8]:
111 112 113 114 115 116 117 118 119 120 ... 736 737 738 739 740 741 742 743 744 745
Human.Homo_sapiens T T G G T T C G G G ... T C G T G A T G T C
Kelp_Gull.Larus_dominicanus A T G G C C C C A A ... A A C A C T A A C C
Baikal_Teal.Anas_formosa A T G G C C C C A A ... A A C C C T A A C C
Redhead.Aythya_americana - C A A C C T C A - ... C G A G G C A T T C
Ferret.Mustela_putorius A T C C G T G C T G ... A C A C T T A T T T
Sloth_Bear.Melursus_ursinus A T T C G T G C C G ... A C A C T T G T T C
Panda.Ailuropoda_melanoleuca A T C C G T G C T G ... A C A T C T A T T T
Camel.Camelus_dromedarius A T T C G T G C T G ... A C A C C T A T T T
Horse.Equus_ferus_caballus A T C C G T G C T G ... A C A C C T A T T C
Domestic_Cat.Felis_catus - T C C G G G C C G ... A C A C T T A T T C
Swine.Sus_scrofa A T T C G C G C T G ... A C A C T T G T T C
Turkey.Meleagris_gallopavo A T C C G T G C A G ... A C A C C T A T T T
Japanese_Quail.Coturnix_japonica A T C C G C G C A G ... A C A C T T A T T T
Chicken.Gallus_gallus - - - - - - - - - - ... A C A C C T A T T C
Chinese_Francolin.Francolinus_pintadeanus A T C C G C G C A G ... A C A C C T A T T C
Garganey.Anas_querquedula A T C C G C G C A G ... A C A C C T A T T T
Northern_Shoveler.Anas_clypeata A T C C G C G C A G ... A C A C C T A T T T
Blue-Winged_Teal.Anas_discors A T C C G C G C A G ... A C A C C T A - - -
Cinnamon_Teal.Anas_cyanoptera A T C C G C G C A G ... A C A C C T A - - -
American_Green-Winged_Teal.Anas_carolinensis A T C C G C G C A G ... G C A C C T A T T T
Green-Winged_Teal.Anas_carolinensis A T C C G C G C A G ... G C A C C T A T T T
Mallard.Anas_platyrhynchos A T C C G G G C A G ... A C A C C T A T T T
American_Black_Duck.Anas_rubripes A T C C G G G C A G ... G C A C C T A T T T
American_Wigeon.Anas_americana A T C C G C G C A G ... A C A C C T A T T T
Gadwall.Anas_strepera A T C C G C G C A G ... A C A C C T A T T T
Red-Crested_Pochard.Netta_rufina A T C C G C G C A G ... A C A T C T A T T C
Hooded_Merganser.Lophodytes_cucullatus A T C C G C G C A G ... A C A C C T G T T C
Rosy-Billed_Pochard.Netta_peposaca A T C C G C G C A G ... A C A C C T A T T C
Ring-Necked_Duck.Aythya_collaris A T C C G C G C A G ... A C A C C T A T T C
Canvasback.Aythya_valisineria A T C C G C G C A G ... A C A C C T A T T C
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Snow_Goose.Chen_caerulescens A T C C G C G C A G ... A C A C C T A T T C
Pink-Footed_Goose.Anser_brachyrhynchus A T C C G C G C A G ... A C A C C T A T T C
Bar-Headed_Goose.Anser_indicus A T C C G C G C A G ... A C A C C T A T T C
Canada_Goose.Branta_canadensis A T C C G C G C A G ... G C A C C T A T T C
Barnacle_Goose.Branta_leucopsis A T C C G C G C A G ... A C A C C T A T T C
Whiskered_Tern.Chlidonias_hybrida A T T C G T G C A G ... A C A T C T A T T C
Common_Murre.Uria_aalge A T C C G T G C A G ... A C A C C T C T T C
Brown-Headed_Gull.Larus_brunnicephalus A T T C G T G C A G ... A C A C C T C T T C
Laughing_Gull.Larus_atricilla A T T C G T G C A G ... A C A C C T C T T C
Glaucous_Gull.Larus_hyperboreus A T C C G T G C A G ... A C A C C T C T T C
Ring-Billed_Gull.Larus_delawarensis A T C C G T G C A G ... A C A C C T C T T C
Herring_Gull.Larus_smithsonianus A T C C G T G C A G ... A C A C C T C - - -
Iceland_Gull.Larus_glaucoides A T C C G T G C A G ... A C A C C T C T T T
Ostrich.Struthio_camelus A T T C G T G C A G ... A C A C C T C T T C
Peregrine_Falcon.Falco_peregrinus A T T C G A A C A G ... A C A C C T A T T C
Saker_Falcon.Falco_cherrug A T T C G A A C A G ... A C A C T T A T T C
Ruddy_Turnstone.Arenaria_interpres A T T C G C G C A G ... A C A T C T C T T C
Sharp-Tailed_Sandpiper.Calidris_acuminata A T T C G T G C A G ... G C A C C T G T T C
Red-Necked_Stint.Calidris_ruficollis A T C C G T G C A G ... A C A C C T T T T C
Sanderling.Calidris_alba A T T C G A G C A G ... A C A T C T C T T C
White-Rumped_Sandpiper.Calidris_fuscicollis A T T C G T G C A G ... A C A T C T C T T C
Semipalmated_Sandpiper.Calidris_pusilla A T T C G T G C A G ... A C A T C T C T T C
Least_Sandpiper.Calidris_minutilla A T T C G T G C A G ... G C A C C T C T T C
Dunlin.Calidris_alpina A T T C G T G C A G ... A C A T C T C T T C
Red_Knot.Calidris_canutus A T T C G C G C A G ... A C A C C T C T T C
Little_Egret.Egretta_garzetta A T C C G A G C T G ... A C A C C T A T T C
Magpie.Anseranas_semipalmata A T C C G C G C A G ... G C A C C T C T T C
Grey_Heron.Ardea_cinerea A T C C G A G C T G ... A C A T C T C T T C
Great_Crested_Grebe.Podiceps_cristatus A T C C G C G C A G ... A C A C C T C T T C
Little_Grebe.Tachybaptus_ruficollis A T C C G T G C A G ... A C A T C T C T T C

74 rows × 580 columns


In [9]:
# Now, I have to concatenate the sequences back into a single string.
# iterrows() already yields each host's row of single-letter columns, so join that
# row directly instead of looking it up again by label (the old `.ix` indexer is
# deprecated and absent from pandas >= 1.0).
trimmed_coi = []
for host_name, letters in coi_df.iterrows():
    seq = Seq(''.join(letters))
    # Empty name/description keep the FASTA header down to just the record ID.
    seqrecord = SeqRecord(seq, description='', id=host_name, name='')
    trimmed_coi.append(seqrecord)
# SeqIO.write returns the number of records written; keep it as the cell output.
SeqIO.write(trimmed_coi, 'host_coi_trimmed.fasta', 'fasta')


Out[9]:
74

In [10]:
# Display the trimmed SeqRecords to eyeball which hosts still carry gap characters.
trimmed_coi


Out[10]:
[SeqRecord(seq=Seq('TTGGTTCGGGGTATGG-----------------------GGTTAGCAGCGGTGT...GTC', Alphabet()), id='Human.Homo_sapiens', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGCCCCAAATCTACGAAAATCTCACCCCCTCCTCAAAATAGTTAACAACTCA...ACC', Alphabet()), id='Kelp_Gull.Larus_dominicanus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGCCCCAAATATCCGCAAATCCCACCCCCTACTAAAAATAATCAACAACTCC...ACC', Alphabet()), id='Baikal_Teal.Anas_formosa', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('-CAACCTCA---------------GGACTAATCATATGATTCCACTATAACTCA...TTC', Alphabet()), id='Redhead.Aythya_americana', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCTGAACTAGGTCAACCTGGCACTCTGCTAGGAGACGACCAGATTTAT...TTT', Alphabet()), id='Ferret.Mustela_putorius', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCCGAACTAGGTCAACCCGGGGCTCTGTTGGGGGATGATCAGATCTAC...TTC', Alphabet()), id='Sloth_Bear.Melursus_ursinus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGTCAGCCTGGAGCTCTGTTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Panda.Ailuropoda_melanoleuca', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCTGAATTGGGGCAGCCTGGGACATTGCTTGGAGATGACCAAATCTAT...TTT', Alphabet()), id='Camel.Camelus_dromedarius', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGCCAACCTGGGACCCTACTAGGAGATGATCAGATCTAC...TTC', Alphabet()), id='Horse.Equus_ferus_caballus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('-TCCGGGCCGAACTGGGCCAACCTGGTACACTACTAGGAGATGATCAGATTTAC...TTC', Alphabet()), id='Domestic_Cat.Felis_catus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCTGAACTAGGTCAGCCCGGAACCCTACTTGGCGATGATCAAATCTAC...TTC', Alphabet()), id='Swine.Sus_scrofa', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTGGGACAACCTGGGACACTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Turkey.Meleagris_gallopavo', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGCACCCTCCTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Japanese_Quail.Coturnix_japonica', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('------------------------------------------------------...TTC', Alphabet()), id='Chicken.Gallus_gallus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGGCAACCCGGAACCCTCTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Chinese_Francolin.Francolinus_pintadeanus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGAACCCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Garganey.Anas_querquedula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGGACTCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Northern_Shoveler.Anas_clypeata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAGCTTGGTCAACCCGGGACTCTCCTGGGCGATGACCAAATTTAC...---', Alphabet()), id='Blue-Winged_Teal.Anas_discors', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAGCTTGGTCAACCCGGGACTCTCCTGGGCGATGACCAAATTTAC...---', Alphabet()), id='Cinnamon_Teal.Anas_cyanoptera', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGGGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Mallard.Anas_platyrhynchos', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGGGCAGAGCTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='American_Black_Duck.Anas_rubripes', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGGACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Wigeon.Anas_americana', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Gadwall.Anas_strepera', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATCTAC...TTC', Alphabet()), id='Red-Crested_Pochard.Netta_rufina', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTAGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Hooded_Merganser.Lophodytes_cucullatus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCGGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Rosy-Billed_Pochard.Netta_peposaca', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Ring-Necked_Duck.Aythya_collaris', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Canvasback.Aythya_valisineria', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Tufted_Duck.Aythya_fuligula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Lesser_Scaup.Aythya_affinis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Greater_Scaup.Aythya_marila', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCTGAACTAGGCCAGCCAGGAACCCTCCTAGGTGATGACCAAATTTAT...TTC', Alphabet()), id='Wood_Duck.Aix_sponsa', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCGGGAACCCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Black_Scoter.Melanitta_nigra', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Bufflehead.Bucephala_albeola', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Common_Goldeneye.Bucephala_clangula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCCGGAACCCTCCTAGGTGATGACCAAATTTAC...TTT', Alphabet()), id='Common_Eider.Somateria_mollissima', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAACCAGGAACCCTCCTGGGTGATGACCAAATTTAC...TTC', Alphabet()), id='Long-Tailed_Duck.Clangula_hyemalis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACCCTCCTCGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Mute_Swan.Cygnus_olor', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCGGGAACTCTCCTTGGTGACGACCAGATCTAT...TTC', Alphabet()), id='Tundra_Swan.Cygnus_columbianus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCAGGAACCCTCCTTGGTGACGACCAGATCTAC...TTC', Alphabet()), id='Whooper_Swan.Cygnus_cygnus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCAGGGACTCTCCTGGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Muscovy_Duck.Cairina_moschata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Greater_White-Fronted_Goose.Anser_albifrons', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Snow_Goose.Chen_caerulescens', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Pink-Footed_Goose.Anser_brachyrhynchus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Bar-Headed_Goose.Anser_indicus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Canada_Goose.Branta_canadensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGTGACGACCAAATTTAC...TTC', Alphabet()), id='Barnacle_Goose.Branta_leucopsis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCAGGAACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Whiskered_Tern.Chlidonias_hybrida', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGGACCCTCCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Common_Murre.Uria_aalge', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGGGACGACCAAATCTAT...TTC', Alphabet()), id='Brown-Headed_Gull.Larus_brunnicephalus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Laughing_Gull.Larus_atricilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Glaucous_Gull.Larus_hyperboreus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGGACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Ring-Billed_Gull.Larus_delawarensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...---', Alphabet()), id='Herring_Gull.Larus_smithsonianus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Iceland_Gull.Larus_glaucoides', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAATTAGGACAACCAGGGACACTACTTGGAGACGATCAAATCTAC...TTC', Alphabet()), id='Ostrich.Struthio_camelus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGAACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Peregrine_Falcon.Falco_peregrinus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGGACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Saker_Falcon.Falco_cherrug', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGTCAACCGGGGACCCTCTTAGGAGACGATCAAATTTAC...TTC', Alphabet()), id='Ruddy_Turnstone.Arenaria_interpres', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCCGGAACTCTCTTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Sharp-Tailed_Sandpiper.Calidris_acuminata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAGCCCGGAACCCTTCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Red-Necked_Stint.Calidris_ruficollis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAGCAGAACTAGGTCAACCCGGGACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Sanderling.Calidris_alba', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGAACTCTTTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='White-Rumped_Sandpiper.Calidris_fuscicollis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAGATCTAC...TTC', Alphabet()), id='Semipalmated_Sandpiper.Calidris_pusilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Least_Sandpiper.Calidris_minutilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGGACTCTTTTAGGAGATGACCAAATTTAC...TTC', Alphabet()), id='Dunlin.Calidris_alpina', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCCGGAACCCTCTTAGGAGATGACCAAATTTAT...TTC', Alphabet()), id='Red_Knot.Calidris_canutus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGCCAGCCAGGAACGCTCCTAGGAGACGACCAGATCTAT...TTC', Alphabet()), id='Little_Egret.Egretta_garzetta', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACCCTCCTAGGCGATGACCAAATCTAT...TTC', Alphabet()), id='Magpie.Anseranas_semipalmata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGACAACCAGGGACGCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Grey_Heron.Ardea_cinerea', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAGCCAGGAACCCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Great_Crested_Grebe.Podiceps_cristatus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGAACCCTTCTAGGAGACGACCAGATCTAC...TTC', Alphabet()), id='Little_Grebe.Tachybaptus_ruficollis', name='', description='', dbxrefs=[])]

In [11]:
# Filter out COI sequences such that only those without gaps are left. This is so that we can do a phylogenetic tree.
no_gaps = []
for s in trimmed_coi:
    if '-' not in s.seq:
        no_gaps.append(s)
no_gaps


Out[11]:
[SeqRecord(seq=Seq('ATCCGTGCTGAACTAGGTCAACCTGGCACTCTGCTAGGAGACGACCAGATTTAT...TTT', Alphabet()), id='Ferret.Mustela_putorius', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCCGAACTAGGTCAACCCGGGGCTCTGTTGGGGGATGATCAGATCTAC...TTC', Alphabet()), id='Sloth_Bear.Melursus_ursinus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGTCAGCCTGGAGCTCTGTTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Panda.Ailuropoda_melanoleuca', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCTGAATTGGGGCAGCCTGGGACATTGCTTGGAGATGACCAAATCTAT...TTT', Alphabet()), id='Camel.Camelus_dromedarius', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGCCAACCTGGGACCCTACTAGGAGATGATCAGATCTAC...TTC', Alphabet()), id='Horse.Equus_ferus_caballus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCTGAACTAGGTCAGCCCGGAACCCTACTTGGCGATGATCAAATCTAC...TTC', Alphabet()), id='Swine.Sus_scrofa', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTGGGACAACCTGGGACACTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Turkey.Meleagris_gallopavo', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGCACCCTCCTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Japanese_Quail.Coturnix_japonica', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGGCAACCCGGAACCCTCTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Chinese_Francolin.Francolinus_pintadeanus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGAACCCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Garganey.Anas_querquedula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGGACTCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Northern_Shoveler.Anas_clypeata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGGGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Mallard.Anas_platyrhynchos', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGGGCAGAGCTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='American_Black_Duck.Anas_rubripes', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGGACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Wigeon.Anas_americana', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Gadwall.Anas_strepera', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATCTAC...TTC', Alphabet()), id='Red-Crested_Pochard.Netta_rufina', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTAGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Hooded_Merganser.Lophodytes_cucullatus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCGGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Rosy-Billed_Pochard.Netta_peposaca', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Ring-Necked_Duck.Aythya_collaris', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Canvasback.Aythya_valisineria', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Tufted_Duck.Aythya_fuligula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Lesser_Scaup.Aythya_affinis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Greater_Scaup.Aythya_marila', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCTGAACTAGGCCAGCCAGGAACCCTCCTAGGTGATGACCAAATTTAT...TTC', Alphabet()), id='Wood_Duck.Aix_sponsa', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCGGGAACCCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Black_Scoter.Melanitta_nigra', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Bufflehead.Bucephala_albeola', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Common_Goldeneye.Bucephala_clangula', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCCGGAACCCTCCTAGGTGATGACCAAATTTAC...TTT', Alphabet()), id='Common_Eider.Somateria_mollissima', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAACCAGGAACCCTCCTGGGTGATGACCAAATTTAC...TTC', Alphabet()), id='Long-Tailed_Duck.Clangula_hyemalis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACCCTCCTCGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Mute_Swan.Cygnus_olor', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCGGGAACTCTCCTTGGTGACGACCAGATCTAT...TTC', Alphabet()), id='Tundra_Swan.Cygnus_columbianus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCAGGAACCCTCCTTGGTGACGACCAGATCTAC...TTC', Alphabet()), id='Whooper_Swan.Cygnus_cygnus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCAGGGACTCTCCTGGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Muscovy_Duck.Cairina_moschata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Greater_White-Fronted_Goose.Anser_albifrons', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Snow_Goose.Chen_caerulescens', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Pink-Footed_Goose.Anser_brachyrhynchus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Bar-Headed_Goose.Anser_indicus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Canada_Goose.Branta_canadensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGTGACGACCAAATTTAC...TTC', Alphabet()), id='Barnacle_Goose.Branta_leucopsis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCAGGAACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Whiskered_Tern.Chlidonias_hybrida', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGGACCCTCCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Common_Murre.Uria_aalge', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGGGACGACCAAATCTAT...TTC', Alphabet()), id='Brown-Headed_Gull.Larus_brunnicephalus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Laughing_Gull.Larus_atricilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Glaucous_Gull.Larus_hyperboreus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGGACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Ring-Billed_Gull.Larus_delawarensis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Iceland_Gull.Larus_glaucoides', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAATTAGGACAACCAGGGACACTACTTGGAGACGATCAAATCTAC...TTC', Alphabet()), id='Ostrich.Struthio_camelus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGAACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Peregrine_Falcon.Falco_peregrinus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGGACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Saker_Falcon.Falco_cherrug', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGTCAACCGGGGACCCTCTTAGGAGACGATCAAATTTAC...TTC', Alphabet()), id='Ruddy_Turnstone.Arenaria_interpres', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCCGGAACTCTCTTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Sharp-Tailed_Sandpiper.Calidris_acuminata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAGCCCGGAACCCTTCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Red-Necked_Stint.Calidris_ruficollis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGAGCAGAACTAGGTCAACCCGGGACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Sanderling.Calidris_alba', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGAACTCTTTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='White-Rumped_Sandpiper.Calidris_fuscicollis', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAGATCTAC...TTC', Alphabet()), id='Semipalmated_Sandpiper.Calidris_pusilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Least_Sandpiper.Calidris_minutilla', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGGACTCTTTTAGGAGATGACCAAATTTAC...TTC', Alphabet()), id='Dunlin.Calidris_alpina', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCCGGAACCCTCTTAGGAGATGACCAAATTTAT...TTC', Alphabet()), id='Red_Knot.Calidris_canutus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGCCAGCCAGGAACGCTCCTAGGAGACGACCAGATCTAT...TTC', Alphabet()), id='Little_Egret.Egretta_garzetta', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACCCTCCTAGGCGATGACCAAATCTAT...TTC', Alphabet()), id='Magpie.Anseranas_semipalmata', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGACAACCAGGGACGCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Grey_Heron.Ardea_cinerea', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAGCCAGGAACCCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Great_Crested_Grebe.Podiceps_cristatus', name='', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGAACCCTTCTAGGAGACGACCAGATCTAC...TTC', Alphabet()), id='Little_Grebe.Tachybaptus_ruficollis', name='', description='', dbxrefs=[])]

In [12]:
# Write the `no_gaps` records (presumably the gap-stripped COI alignment, per
# the name) to disk in both FASTA and PHYLIP formats.
# Only the return value of the last call is displayed as the cell output.
SeqIO.write(no_gaps, 'host_coi_nogaps.fasta', 'fasta')
SeqIO.write(no_gaps, 'host_coi_nogaps.phylip', 'phylip')


Out[12]:
65

In [13]:
# Collect the pairwise edit distance for every unordered pair of sequences.
# NOTE: `Levenshtein.distance` is the Levenshtein (edit) distance, which is
# not the same metric as the Hamming distance.
from itertools import combinations
from Levenshtein import distance
distances = [
    distance(str(record_a.seq), str(record_b.seq))
    for record_a, record_b in combinations(no_gaps, 2)
]

In [14]:
# Histogram of the pairwise sequence distances collected in `distances`
# (default binning); the returned (counts, bin_edges, patches) tuple is shown.
plt.hist(distances)


Out[14]:
(array([  23.,   14.,   38.,  232.,  225.,  193.,  692.,  334.,  239.,   90.]),
 array([   0. ,   14.9,   29.8,   44.7,   59.6,   74.5,   89.4,  104.3,
         119.2,  134.1,  149. ]),
 <a list of 10 Patch objects>)

Construct Phylogenetic Tree


In [15]:
# Run RAxML on the FASTA alignment (-s) with the GTRGAMMA model (-m); the run
# ID given by -n produces files named RAxML_*.host_coi_nogaps.tree.
# NOTE(review): per the recorded output, -T has no effect with this
# (non-Pthreads) binary, and RAxML exits without re-running when files with
# this run ID already exist -- remove them first to regenerate the tree.
!! raxmlHPC -p 100 -# 3 -m GTRGAMMA -s host_coi_nogaps.fasta -n host_coi_nogaps.tree -T 2


Out[15]:
['Option -T does not have any effect with the sequential or parallel MPI version.',
 'It is used to specify the number of threads for the Pthreads-based parallelization',
 '',
 "RAxML can't, parse the alignment file as phylip file ",
 'it will now try to parse it as FASTA file',
 '',
 'RAxML output files with the run ID <host_coi_nogaps.tree> already exist ',
 'in directory /home/ericmjl/influenza-global-reassortment/ ...... exiting']

In [16]:
# List the files in the working directory whose names end in `.tree`, to
# confirm which RAxML outputs are available.
!! ls *.tree


Out[16]:
['RAxML_bestTree.host_coi_nogaps.tree', 'RAxML_info.host_coi_nogaps.tree']

Compute pairwise patristic distances in the phylogenetic tree


In [26]:
from dendropy import Tree
from dendropy.calculate.treemeasure import PatristicDistanceMatrix

# Parse the RAxML best tree (Newick). The context manager guarantees the file
# handle is closed once the tree has been read, instead of leaking it.
with open('RAxML_bestTree.host_coi_nogaps.tree', 'r') as tree_file:
    coi_tree = Tree.get(file=tree_file, schema='newick')

# Pairwise patristic (along-branch) distances between the taxa of the tree.
coi_pds = PatristicDistanceMatrix(coi_tree)
# Displayed as the cell output: the sum of all pairwise distances.
coi_pds.sum_of_distances()


Out[26]:
1015.7242854563826

In [31]:
# Take the taxa attached to the first two leaves of the tree as a sample pair.
leaves = coi_tree.leaf_nodes()
taxon1, taxon2 = leaves[0].taxon, leaves[1].taxon

In [50]:
# Preview how a taxon label looks once its quote characters are stripped.
# The label is kept under its own name so that `taxon2` remains a Taxon
# object: the distance lookup in the following cell needs a Taxon, not a str.
taxon2_label = str(coi_tree.leaf_nodes()[1].taxon)
taxon2_label.replace("'", "")


Out[50]:
'Horse.Equus ferus caballus'

In [34]:
# Look up the patristic distance between the two sample taxa by calling the
# distance matrix object directly.
coi_pds(taxon1, taxon2)


Out[34]:
0.7119165546950544

In [67]:
# Build an undirected graph keyed by host common name, where each edge stores
# the patristic distance between the two hosts under the attribute 'pd'.
# `pds` collects one distance per ORDERED leaf pair (self-pairs included).
patristic_distances = nx.Graph()
pds = []
leaf_nodes = coi_tree.leaf_nodes()
for leaf1, leaf2 in product(leaf_nodes, leaf_nodes):
    # Use `dist`, not `pd`, as the local name: `pd` is the pandas alias
    # imported at the top of the notebook and must not be overwritten.
    dist = coi_pds(leaf1.taxon, leaf2.taxon)

    # Taxon labels look like 'Common Name.Genus species' (sometimes wrapped in
    # quotes); keep only the common-name part before the first '.'.
    t1 = str(leaf1.taxon).replace("'", "").split('.')[0]
    t2 = str(leaf2.taxon).replace("'", "").split('.')[0]

    patristic_distances.add_edge(t1, t2, pd=dist)
    pds.append(dist)

In [68]:
# Histogram of the patristic distances. `pds` has one entry per ordered leaf
# pair (it is filled from itertools.product), so every unordered pair is
# counted twice and the self-pairs are included as well.
plt.hist(pds)


Out[68]:
(array([ 403.,  782.,  364.,  896.,  990.,  106.,   88.,  162.,  354.,   80.]),
 array([ 0.        ,  0.12690987,  0.25381974,  0.38072961,  0.50763948,
         0.63454935,  0.76145921,  0.88836908,  1.01527895,  1.14218882,
         1.26909869]),
 <a list of 10 Patch objects>)

In [69]:
# Largest patristic distance between any two leaves of the tree.
max(pds)


Out[69]:
1.2690986907853712

In [70]:
# Smallest patristic distance; `pds` includes each leaf paired with itself,
# so this is not restricted to pairs of distinct hosts.
min(pds)


Out[70]:
0.0

In [71]:
# Persist the patristic-distance graph as a pickle for reuse elsewhere.
# NOTE(review): assumes the supp_data/ directory already exists -- confirm.
nx.write_gpickle(patristic_distances, 'supp_data/patristic_distances.pkl')

In [73]:
# Spot-check: show every neighbour of 'Mallard' together with the stored
# edge data (the 'pd' patristic distance) via the graph's adjacency lookup.
patristic_distances['Mallard']


Out[73]:
{'American Black Duck': {'pd': 0.00978530479098707},
 'American Green-Winged Teal': {'pd': 0.0760159068606335},
 'American Wigeon': {'pd': 0.05329789418179376},
 'Bar-Headed Goose': {'pd': 0.16440706032963423},
 'Barnacle Goose': {'pd': 0.1525078793951098},
 'Black Scoter': {'pd': 0.1313820887207964},
 'Brown-Headed Gull': {'pd': 0.5359870461849322},
 'Bufflehead': {'pd': 0.13844900323749146},
 'Camel': {'pd': 1.139610961267169},
 'Canada Goose': {'pd': 0.1487864476189279},
 'Canvasback': {'pd': 0.12777814104578533},
 'Chinese Francolin': {'pd': 0.41679378743034057},
 'Common Eider': {'pd': 0.12320986574712678},
 'Common Goldeneye': {'pd': 0.1250179531566235},
 'Common Murre': {'pd': 0.5037977010375587},
 'Dunlin': {'pd': 0.5541468716317777},
 'Ferret': {'pd': 1.0052196033816883},
 'Gadwall': {'pd': 0.06077763709949219},
 'Garganey': {'pd': 0.09094589474066593},
 'Glaucous Gull': {'pd': 0.5522423313203503},
 'Great Crested Grebe': {'pd': 0.4322418927584293},
 'Greater Scaup': {'pd': 0.12209641915797556},
 'Greater White-Fronted Goose': {'pd': 0.16375245824132825},
 'Green-Winged Teal': {'pd': 0.0760159068606335},
 'Grey Heron': {'pd': 0.4983191919597433},
 'Hooded Merganser': {'pd': 0.15361574839833744},
 'Horse': {'pd': 1.0398314617667486},
 'Iceland Gull': {'pd': 0.5585970770247295},
 'Japanese Quail': {'pd': 0.4123314279477942},
 'Laughing Gull': {'pd': 0.5538684305736757},
 'Least Sandpiper': {'pd': 0.5538078786053702},
 'Lesser Scaup': {'pd': 0.12529268690501263},
 'Little Egret': {'pd': 0.45458216513256133},
 'Little Grebe': {'pd': 0.435422329525462},
 'Long-Tailed Duck': {'pd': 0.13113589994386673},
 'Magpie': {'pd': 0.5184911095605531},
 'Mallard': {'pd': 0.0},
 'Muscovy Duck': {'pd': 0.17436409315181028},
 'Mute Swan': {'pd': 0.1909289111215217},
 'Northern Shoveler': {'pd': 0.0944879302701608},
 'Ostrich': {'pd': 0.6269793620310405},
 'Panda': {'pd': 1.0143060249254658},
 'Peregrine Falcon': {'pd': 0.5833636672043966},
 'Pink-Footed Goose': {'pd': 0.16125627242943497},
 'Red Knot': {'pd': 0.514820128052132},
 'Red-Crested Pochard': {'pd': 0.19320986169215124},
 'Red-Necked Stint': {'pd': 0.5528505340557295},
 'Ring-Billed Gull': {'pd': 0.5619518993833915},
 'Ring-Necked Duck': {'pd': 0.12738246219395677},
 'Rosy-Billed Pochard': {'pd': 0.13739837307773234},
 'Ruddy Turnstone': {'pd': 0.49730290885553485},
 'Saker Falcon': {'pd': 0.6160120296332262},
 'Sanderling': {'pd': 0.556583392075901},
 'Semipalmated Sandpiper': {'pd': 0.5321150884894232},
 'Sharp-Tailed Sandpiper': {'pd': 0.5191468062037256},
 'Sloth Bear': {'pd': 1.0620796615274277},
 'Snow Goose': {'pd': 0.1604615363043253},
 'Swine': {'pd': 0.7635753681642896},
 'Tufted Duck': {'pd': 0.12898171178406181},
 'Tundra Swan': {'pd': 0.19277471442754854},
 'Turkey': {'pd': 0.4339315353387678},
 'Whiskered Tern': {'pd': 0.5655726881069701},
 'White-Rumped Sandpiper': {'pd': 0.5635020165658157},
 'Whooper Swan': {'pd': 0.18976871641150603},
 'Wood Duck': {'pd': 0.16351074763269205}}

In [ ]: